library(readr)
library(MVA)
## Loading required package: HSAUR2
## Loading required package: tools
library(HSAUR2)
library(SciViews)
library(scatterplot3d)
library(car)
## Loading required package: carData
library(lattice)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(ggridges)
library(ggvis)
##
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
##
## resolution
library(ggthemes)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
library(gapminder)
library(gganimate)
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session
##
## Attaching package: 'gganimate'
## The following object is masked from 'package:ggvis':
##
## view_static
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble 3.1.8 ✔ stringr 1.5.0
## ✔ tidyr 1.3.0 ✔ forcats 1.0.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ ggvis::resolution() masks ggplot2::resolution()
## ✖ purrr::some() masks car::some()
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(RColorBrewer)
library(Hotelling)
## Loading required package: corpcor
##
## Attaching package: 'Hotelling'
##
## The following object is masked from 'package:dplyr':
##
## summarise
library(stats)
library(biotools)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## ---
## biotools version 4.2
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
library(ggfortify)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## The following object is masked from 'package:car':
##
## logit
library(corrplot)
## corrplot 0.92 loaded
# Load the protein-consumption data set: 25 countries x 11 columns
# (Country plus 10 numeric food-group measurements, per the spec below).
# NOTE(review): hard-coded absolute path to ~/Downloads is non-portable;
# consider a project-relative path.
pdatas <- read_csv("~/Downloads/Protein_Consumption.csv")
## Rows: 25 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Country
## dbl (10): Red Meat, White Meat, Egg, Milk, Fish, Cereals, Starchy Foods, Pul...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(pdatas)
# Inspect parsed column types and a preview of each column.
str(pdatas)
## spc_tbl_ [25 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Country : chr [1:25] "Albania" "Austria" "Belgium" "Bulgaria" ...
## $ Red Meat : num [1:25] 10 9 14 8 10 11 8 10 18 10 ...
## $ White Meat : num [1:25] 1 14 9 6 11 11 12 5 10 3 ...
## $ Egg : num [1:25] 1 4 4 2 3 4 4 3 3 3 ...
## $ Milk : num [1:25] 9 20 18 8 13 25 11 34 20 18 ...
## $ Fish : num [1:25] 0 2 5 1 2 10 5 6 6 6 ...
## $ Cereals : num [1:25] 42 28 27 57 34 22 25 26 28 42 ...
## $ Starchy Foods : num [1:25] 1 4 6 1 5 5 7 5 5 2 ...
## $ Pulses Nuts and Oilseeds: num [1:25] 6 1 2 4 1 1 1 1 2 8 ...
## $ Fruits and Vegetables : num [1:25] 2 4 4 4 4 2 4 1 7 7 ...
## $ Total : num [1:25] 72 86 89 91 83 91 77 91 99 99 ...
## - attr(*, "spec")=
## .. cols(
## .. Country = col_character(),
## .. `Red Meat` = col_double(),
## .. `White Meat` = col_double(),
## .. Egg = col_double(),
## .. Milk = col_double(),
## .. Fish = col_double(),
## .. Cereals = col_double(),
## .. `Starchy Foods` = col_double(),
## .. `Pulses Nuts and Oilseeds` = col_double(),
## .. `Fruits and Vegetables` = col_double(),
## .. Total = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# NOTE(review): attach() places the data frame on the search path -- a known
# footgun (name masking, stale copies). Kept as-is in case later chunks rely
# on the attached names; prefer pdatas$col or with(pdatas, ...) in new code.
attach(pdatas)
# Pairwise correlations between the numeric measurements (Country, the
# character column in position 1, is dropped).
cor(pdatas[, -1])
## Red Meat White Meat Egg Milk
## Red Meat 1.00000000 0.18850977 0.57532001 0.5440251
## White Meat 0.18850977 1.00000000 0.60095535 0.2974816
## Egg 0.57532001 0.60095535 1.00000000 0.6130310
## Milk 0.54402512 0.29748163 0.61303102 1.0000000
## Fish 0.06491072 -0.19719960 0.04780844 0.1624624
## Cereals -0.50970337 -0.43941908 -0.70131040 -0.5924925
## Starchy Foods 0.15383673 0.33456770 0.41266333 0.2144917
## Pulses Nuts and Oilseeds -0.40988882 -0.67214885 -0.59519381 -0.6238357
## Fruits and Vegetables -0.06393465 -0.07329308 -0.16392249 -0.3997753
## Total 0.37369919 0.10308602 0.18970028 0.4603542
## Fish Cereals Starchy Foods
## Red Meat 0.06491072 -0.50970337 0.15383673
## White Meat -0.19719960 -0.43941908 0.33456770
## Egg 0.04780844 -0.70131040 0.41266333
## Milk 0.16246239 -0.59249246 0.21449173
## Fish 1.00000000 -0.51714759 0.43868411
## Cereals -0.51714759 1.00000000 -0.57813449
## Starchy Foods 0.43868411 -0.57813449 1.00000000
## Pulses Nuts and Oilseeds -0.12226043 0.63605948 -0.49518800
## Fruits and Vegetables 0.22948842 0.04229293 0.06835670
## Total -0.09089592 0.18587578 -0.04418245
## Pulses Nuts and Oilseeds Fruits and Vegetables
## Red Meat -0.4098888 -0.06393465
## White Meat -0.6721488 -0.07329308
## Egg -0.5951938 -0.16392249
## Milk -0.6238357 -0.39977527
## Fish -0.1222604 0.22948842
## Cereals 0.6360595 0.04229293
## Starchy Foods -0.4951880 0.06835670
## Pulses Nuts and Oilseeds 1.0000000 0.35133227
## Fruits and Vegetables 0.3513323 1.00000000
## Total -0.0812251 0.07201466
## Total
## Red Meat 0.37369919
## White Meat 0.10308602
## Egg 0.18970028
## Milk 0.46035417
## Fish -0.09089592
## Cereals 0.18587578
## Starchy Foods -0.04418245
## Pulses Nuts and Oilseeds -0.08122510
## Fruits and Vegetables 0.07201466
## Total 1.00000000
# PCA on the numeric columns (Country dropped). `scale. = TRUE` standardises
# each variable, so the PCA is effectively on the correlation matrix.
# Fix: the formal argument of prcomp() is `scale.` (with a dot); the original
# `scale = TRUE` only worked via R's partial argument matching.
# NOTE(review): the Total column is kept in the PCA. Its inclusion is the
# likely reason PC10's standard deviation below is numerically zero
# (6.27e-16): Total appears to be a linear combination of the other columns
# (e.g. row 1: 10+1+1+9+0+42+1+6+2 = 72). Confirm and consider dropping it.
pdata_pca <- prcomp(pdatas[, -1], scale. = TRUE)
pdata_pca
## Standard deviations (1, .., p=10):
## [1] 2.032257e+00 1.319067e+00 1.144237e+00 1.021544e+00 8.360847e-01
## [6] 6.531975e-01 5.841454e-01 4.366348e-01 3.458098e-01 6.274404e-16
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4
## Red Meat -0.3180769 -0.17809245 -0.38142753 -0.039766137
## White Meat -0.3140588 -0.11783853 0.36420271 0.538507972
## Egg -0.4202281 -0.08236350 0.02047575 0.155623651
## Milk -0.3870300 -0.23356182 -0.19997405 -0.320360929
## Fish -0.1271598 0.57388821 -0.33003267 -0.304161366
## Cereals 0.4177240 -0.31321549 -0.02354236 0.104798477
## Starchy Foods -0.2880798 0.41038324 0.05768490 0.150709175
## Pulses Nuts and Oilseeds 0.4177658 0.04145202 -0.24796403 0.008042093
## Fruits and Vegetables 0.1197680 0.34858202 -0.41210384 0.643455476
## Total -0.1062294 -0.41709540 -0.58081103 0.203145847
## PC5 PC6 PC7 PC8
## Red Meat 0.53138781 -0.393811788 0.42940825 -0.1592276
## White Meat -0.09760147 0.309417061 0.09254681 -0.2919567
## Egg 0.26932734 -0.059357751 -0.63995627 -0.2652806
## Milk -0.15848975 0.307976584 -0.17405921 0.5444724
## Fish -0.20323386 0.303075844 0.06315829 -0.5200308
## Cereals -0.29201244 -0.196460437 0.06971238 -0.2001491
## Starchy Foods -0.42198545 -0.680457657 -0.11769041 0.1889672
## Pulses Nuts and Oilseeds 0.22507285 -0.087921207 -0.57816932 -0.0829400
## Fruits and Vegetables 0.16834367 0.222568384 0.08684392 0.3701826
## Total -0.47623561 -0.007702046 -0.05178373 -0.1801923
## PC9 PC10
## Red Meat -0.17150487 0.20838019
## White Meat -0.46186736 0.22903415
## Egg 0.48098579 0.06827056
## Milk -0.13218960 0.43456461
## Fish 0.01789764 0.21247753
## Cereals 0.30436394 0.67412235
## Starchy Foods -0.14706957 0.10134794
## Pulses Nuts and Oilseeds -0.58938418 0.12362100
## Fruits and Vegetables 0.20995988 0.11723988
## Total -0.04898111 -0.41440004
# Variance decomposition across the principal components.
summary(pdata_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 2.032 1.319 1.1442 1.0215 0.8361 0.65320 0.58415 0.43663
## Proportion of Variance 0.413 0.174 0.1309 0.1044 0.0699 0.04267 0.03412 0.01906
## Cumulative Proportion 0.413 0.587 0.7179 0.8223 0.8922 0.93485 0.96898 0.98804
## PC9 PC10
## Standard deviation 0.34581 6.274e-16
## Proportion of Variance 0.01196 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
# For pdata_pca, summary() shows that PC1 explains 41.30% of the total
# variance, PC2 an additional 17.40%, and so on. (The original comment named
# "sparrows_pca" -- a copy-paste from another analysis -- and quoted
# percentages that do not match the output above.)
# The cumulative-proportion row gives the variance explained by the leading
# PCs taken together: the first two PCs explain 58.70%, the first three
# 71.79%, the first eight 98.80%, and the first nine ~100%; PC10 contributes
# essentially nothing.
# Eigenvalues of the correlation matrix: the variance captured along each
# principal axis (squared standard deviations of the components).
eigen_pdata <- pdata_pca$sdev^2
eigen_pdata
## [1] 4.130067e+00 1.739939e+00 1.309278e+00 1.043551e+00 6.990377e-01
## [6] 4.266669e-01 3.412258e-01 1.906500e-01 1.195844e-01 3.936815e-31
# Each eigenvalue measures how much the data spread along its axis: the
# first axis carries the most variance (~4.13), while the last (~3.9e-31)
# is numerically zero, i.e. the data effectively occupy only 9 dimensions.
# (Rewrote the original comment, which garbled this interpretation.)
# Label the eigenvalues PC1..PC10. Fix: paste0() replaces the
# paste(..., sep = "") anti-pattern.
names(eigen_pdata) <- paste0("PC", 1:10)
sumlambdas <- sum(eigen_pdata)
sumlambdas
## [1] 10
## [1] 10
# Share of total variance explained by each PC (eigenvalue / sum of
# eigenvalues; the sum is 10 because the data were standardised).
propvar <- eigen_pdata/sumlambdas
propvar
## PC1 PC2 PC3 PC4 PC5 PC6
## 4.130067e-01 1.739939e-01 1.309278e-01 1.043551e-01 6.990377e-02 4.266669e-02
## PC7 PC8 PC9 PC10
## 3.412258e-02 1.906500e-02 1.195844e-02 3.936815e-32
#The amount of variation that each principal component explains is stored in the propvar variable.
# Running total of the variance shares: how much the first k PCs explain.
cumvar_pdatas <- cumsum(propvar)
cumvar_pdatas
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 0.4130067 0.5870006 0.7179284 0.8222835 0.8921873 0.9348540 0.9689766 0.9880416
## PC9 PC10
## 1.0000000 1.0000000
#Based on the values in cumvar_pdatas, we can see that:
#PC1 explains 41.30% of the total variance in the data set.
#PC1 and PC2 combined explain 58.70% of the total variance in the data set.
#PC1 to PC3 combined explain 71.79% of the total variance in the data set.
#PC1 to PC4 combined explain 82.23% of the total variance in the data set.
#PC1 to PC5 combined explain 89.22% of the total variance in the data set.
#PC1 to PC6 combined explain 93.49% of the total variance in the data set.
#PC1 to PC7 combined explain 96.90% of the total variance in the data set.
#PC1 to PC8 combined explain 98.80% of the total variance in the data set.
#PC1 to PC9 combined explain 100% of the total variance in the data set.
#PC1 to PC10 combined also explain 100% of the total variance in the data set.
# Assemble eigenvalues, per-component variance share and its running total
# into one labelled summary matrix, rounded for display. The row labels are
# supplied directly as named arguments to rbind().
matlambdas <- rbind(
  Eigenvalues           = eigen_pdata,
  `Prop. variance`      = propvar,
  `Cum. prop. variance` = cumvar_pdatas
)
round(matlambdas, 4)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Eigenvalues 4.1301 1.7399 1.3093 1.0436 0.6990 0.4267 0.3412 0.1906
## Prop. variance 0.4130 0.1740 0.1309 0.1044 0.0699 0.0427 0.0341 0.0191
## Cum. prop. variance 0.4130 0.5870 0.7179 0.8223 0.8922 0.9349 0.9690 0.9880
## PC9 PC10
## Eigenvalues 0.1196 0
## Prop. variance 0.0120 0
## Cum. prop. variance 1.0000 1
# Repeat of summary(pdata_pca) from above; retained for the rendered report.
summary(pdata_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 2.032 1.319 1.1442 1.0215 0.8361 0.65320 0.58415 0.43663
## Proportion of Variance 0.413 0.174 0.1309 0.1044 0.0699 0.04267 0.03412 0.01906
## Cumulative Proportion 0.413 0.587 0.7179 0.8223 0.8922 0.93485 0.96898 0.98804
## PC9 PC10
## Standard deviation 0.34581 6.274e-16
## Proportion of Variance 0.01196 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
# PC1 has the largest standard deviation and explains 41.3% of the variance;
# PC2 and PC3 add 17.4% and 13.1% respectively, so the first three PCs cover
# 71.8% of the total variation. The last components have very small standard
# deviations -- PC10 explains essentially none of the variance (its
# eigenvalue is numerically zero) -- so they add little to a low-dimensional
# summary of the data.
# Loadings (rotation matrix): the weight of each original variable in each PC.
pdata_pca$rotation
## PC1 PC2 PC3 PC4
## Red Meat -0.3180769 -0.17809245 -0.38142753 -0.039766137
## White Meat -0.3140588 -0.11783853 0.36420271 0.538507972
## Egg -0.4202281 -0.08236350 0.02047575 0.155623651
## Milk -0.3870300 -0.23356182 -0.19997405 -0.320360929
## Fish -0.1271598 0.57388821 -0.33003267 -0.304161366
## Cereals 0.4177240 -0.31321549 -0.02354236 0.104798477
## Starchy Foods -0.2880798 0.41038324 0.05768490 0.150709175
## Pulses Nuts and Oilseeds 0.4177658 0.04145202 -0.24796403 0.008042093
## Fruits and Vegetables 0.1197680 0.34858202 -0.41210384 0.643455476
## Total -0.1062294 -0.41709540 -0.58081103 0.203145847
## PC5 PC6 PC7 PC8
## Red Meat 0.53138781 -0.393811788 0.42940825 -0.1592276
## White Meat -0.09760147 0.309417061 0.09254681 -0.2919567
## Egg 0.26932734 -0.059357751 -0.63995627 -0.2652806
## Milk -0.15848975 0.307976584 -0.17405921 0.5444724
## Fish -0.20323386 0.303075844 0.06315829 -0.5200308
## Cereals -0.29201244 -0.196460437 0.06971238 -0.2001491
## Starchy Foods -0.42198545 -0.680457657 -0.11769041 0.1889672
## Pulses Nuts and Oilseeds 0.22507285 -0.087921207 -0.57816932 -0.0829400
## Fruits and Vegetables 0.16834367 0.222568384 0.08684392 0.3701826
## Total -0.47623561 -0.007702046 -0.05178373 -0.1801923
## PC9 PC10
## Red Meat -0.17150487 0.20838019
## White Meat -0.46186736 0.22903415
## Egg 0.48098579 0.06827056
## Milk -0.13218960 0.43456461
## Fish 0.01789764 0.21247753
## Cereals 0.30436394 0.67412235
## Starchy Foods -0.14706957 0.10134794
## Pulses Nuts and Oilseeds -0.58938418 0.12362100
## Fruits and Vegetables 0.20995988 0.11723988
## Total -0.04898111 -0.41440004
#Cereals and pulses/nuts/oilseeds have large positive weights for the first principal component (PC1), while red meat, white meat, egg, and milk have negative weights. This indicates that PC1 may be associated with a plant-based diet.
#High positive weights are assigned to fish, fruits and vegetables, and starchy foods on the second principal component (PC2), while Total and red meat have negative weights. This indicates that PC2 is associated with a diet rich in fish, fruits and vegetables, and starchy foods, but low in red meat.
#The third principal component (PC3) gives white meat high positive weights and fruits and vegetables negative weights. This shows that PC3 is associated with a diet rich in white meat but lacking in fruits and vegetables.
#On the fourth principal component (PC4), fruits and vegetables and starchy foods have high positive weights, while red meat and fish have negative weights. This indicates that PC4 is associated with a diet consisting of fruits, vegetables, and starchy foods, but not red meat and fish.
#The remaining principal components (PC5 to PC10) have extremely modest variations and hence account for a negligible portion of the total data variation.
# print(pdata_pca) repeats the standard deviations and rotation shown above.
print(pdata_pca)
## Standard deviations (1, .., p=10):
## [1] 2.032257e+00 1.319067e+00 1.144237e+00 1.021544e+00 8.360847e-01
## [6] 6.531975e-01 5.841454e-01 4.366348e-01 3.458098e-01 6.274404e-16
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4
## Red Meat -0.3180769 -0.17809245 -0.38142753 -0.039766137
## White Meat -0.3140588 -0.11783853 0.36420271 0.538507972
## Egg -0.4202281 -0.08236350 0.02047575 0.155623651
## Milk -0.3870300 -0.23356182 -0.19997405 -0.320360929
## Fish -0.1271598 0.57388821 -0.33003267 -0.304161366
## Cereals 0.4177240 -0.31321549 -0.02354236 0.104798477
## Starchy Foods -0.2880798 0.41038324 0.05768490 0.150709175
## Pulses Nuts and Oilseeds 0.4177658 0.04145202 -0.24796403 0.008042093
## Fruits and Vegetables 0.1197680 0.34858202 -0.41210384 0.643455476
## Total -0.1062294 -0.41709540 -0.58081103 0.203145847
## PC5 PC6 PC7 PC8
## Red Meat 0.53138781 -0.393811788 0.42940825 -0.1592276
## White Meat -0.09760147 0.309417061 0.09254681 -0.2919567
## Egg 0.26932734 -0.059357751 -0.63995627 -0.2652806
## Milk -0.15848975 0.307976584 -0.17405921 0.5444724
## Fish -0.20323386 0.303075844 0.06315829 -0.5200308
## Cereals -0.29201244 -0.196460437 0.06971238 -0.2001491
## Starchy Foods -0.42198545 -0.680457657 -0.11769041 0.1889672
## Pulses Nuts and Oilseeds 0.22507285 -0.087921207 -0.57816932 -0.0829400
## Fruits and Vegetables 0.16834367 0.222568384 0.08684392 0.3701826
## Total -0.47623561 -0.007702046 -0.05178373 -0.1801923
## PC9 PC10
## Red Meat -0.17150487 0.20838019
## White Meat -0.46186736 0.22903415
## Egg 0.48098579 0.06827056
## Milk -0.13218960 0.43456461
## Fish 0.01789764 0.21247753
## Cereals 0.30436394 0.67412235
## Starchy Foods -0.14706957 0.10134794
## Pulses Nuts and Oilseeds -0.58938418 0.12362100
## Fruits and Vegetables 0.20995988 0.11723988
## Total -0.04898111 -0.41440004
# Scores: coordinates of each of the 25 countries in principal-component space.
pdata_pca$x
## PC1 PC2 PC3 PC4 PC5 PC6
## [1,] 3.5978397 -0.64061101 1.1118946 -1.91119245 1.884437106 -0.37593345
## [2,] -1.3862854 -0.70991905 1.1613381 0.93107494 -0.009121937 0.75816906
## [3,] -1.6608482 0.10781730 -0.4231894 0.24680766 0.188016546 -0.91001548
## [4,] 2.9881523 -1.84361307 -0.0730564 0.30616165 -0.134812297 0.29005421
## [5,] -0.3686147 -0.10141825 1.2155042 0.72202089 -0.062918010 -0.37091750
## [6,] -2.4923551 0.18474749 -0.2075253 -0.93906831 -0.822177041 0.65204948
## [7,] -1.2387459 1.58140979 1.9302394 0.77259151 -0.139755937 -0.58954056
## [8,] -1.7732789 -0.75352175 -0.3644876 -2.28429396 -1.224019848 0.17828822
## [9,] -1.6448018 -0.30606640 -2.4846910 1.25325810 0.230223125 -0.33223855
## [10,] 2.0943234 -0.61997417 -3.0846378 0.31332068 0.270784604 0.64981699
## [11,] 1.4808993 -0.43978564 1.6090270 1.21709297 -0.143865961 0.11534733
## [12,] -2.6714332 -1.03848419 -0.2833724 0.15763312 0.181076517 -0.86151844
## [13,] 1.5660043 -0.01064018 -0.5907111 0.54266246 1.069631810 0.77586008
## [14,] -1.7006997 -0.50438298 0.7596605 0.64321026 0.292062273 0.92348043
## [15,] -0.8828201 1.28521025 -0.1832152 -1.71931314 -0.439007528 0.41757899
## [16,] -0.2286613 0.19642466 -0.4058046 1.67696384 -1.334150980 0.08818598
## [17,] 2.0912590 4.41252506 -0.6718598 -0.03434506 -0.291193444 0.33278906
## [18,] 2.6049767 -1.05771521 0.5868844 -0.14252039 -0.533268313 -0.20083289
## [19,] 1.5709389 2.67472726 -0.2892457 0.23912301 0.594881631 -0.60647031
## [20,] -1.8343339 0.36443676 0.5444138 -1.56417414 0.158327086 0.80195706
## [21,] -0.9293183 -0.96269089 -0.3476755 0.27836268 0.755554148 0.70844461
## [22,] -1.9728952 -0.55508144 -0.8727628 -0.60997694 1.396218668 -1.20971357
## [23,] 0.7660628 -0.48463412 -0.2720099 -0.40950179 -1.470304012 -1.24044252
## [24,] -1.6857673 0.30943116 1.2190705 0.55052071 0.810416131 0.20076819
## [25,] 3.7104025 -1.08819138 0.4162119 -0.23641829 -1.227034337 -0.19516642
## PC7 PC8 PC9 PC10
## [1,] 0.6467777066 0.308209567 -0.344610598 3.383713e-16
## [2,] 0.0005093868 -0.012933034 0.124176638 -7.509950e-16
## [3,] 0.1534640851 -0.334041295 0.023323758 -6.583450e-16
## [4,] 0.5999541449 -0.762640350 0.674235551 -1.353251e-15
## [5,] 0.7878924305 -0.039689570 0.241927022 -6.505439e-16
## [6,] -0.0364433564 -0.984127670 -0.168254146 -6.316059e-16
## [7,] -0.0632650200 -0.313388346 0.320254182 -1.165347e-15
## [8,] -0.0506617637 0.792618282 0.004268287 1.455502e-16
## [9,] 1.3629405718 -0.176345585 -0.392094989 -8.056029e-17
## [10,] -1.1867279230 -0.252605939 -0.185325024 -1.079761e-15
## [11,] -0.8173673169 -0.201792286 -0.496946360 -8.170773e-16
## [12,] -0.7338089555 0.194588527 -0.047542669 -5.234677e-16
## [13,] 0.0085984337 0.435335074 0.815121519 -1.291806e-15
## [14,] -0.2530352518 0.088559649 -0.434700410 -2.826196e-16
## [15,] 0.0122896156 0.009259812 0.182509788 -5.117660e-16
## [16,] -0.0295375727 0.839590880 0.341088667 -1.071960e-15
## [17,] 0.6466024099 -0.205548666 -0.304794550 -7.833882e-16
## [18,] -0.2135771460 -0.211277632 -0.024663621 -6.965860e-16
## [19,] -0.9520057576 0.408309790 0.166895175 -1.276369e-15
## [20,] -0.1459371778 -0.241698086 0.340921291 -5.136336e-16
## [21,] 0.6841927749 0.678069260 -0.252544924 1.910841e-16
## [22,] -0.4798955917 -0.365578790 0.224480622 -7.803610e-16
## [23,] 0.3126133335 0.287576242 -0.038575860 -5.234677e-16
## [24,] -0.0977006735 0.141407912 -0.415483582 -2.858208e-16
## [25,] -0.1558713867 -0.081857747 -0.353665768 -5.750783e-16
# factoextra helper bundling coordinates, correlations, cos2 and
# contributions for the variables.
var <- get_pca_var(pdata_pca)
var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
#plot(eigen_pdata, xlab = "Component number", ylab = "Component variance", type = "l", main = "Scree diagram")
#We can see that in the scree plot above elbow is forming at 5 hence first 4 components should be kept for the analysis.
# Scatterplot/correlation matrix of the measurement columns.
# Two fixes relative to the original call:
#  * drop Country (column 1) as well as Total (column 11) -- Country is a
#    character column and cannot be correlated;
#  * the original bg = c("red", "blue")[pdatas$Country] indexed a length-2
#    colour vector by a *character* vector, which returns NA for every
#    point. Colour by country instead, one colour per factor level.
pairs.panels(pdatas[, c(-1, -11)],
             gap = 0,
             bg = rainbow(length(unique(pdatas$Country)))[factor(pdatas$Country)],
             pch = 21)

# The PC scores are uncorrelated; the diagonal of their covariance matrix
# recovers the eigenvalues computed above.
diag(cov(pdata_pca$x))
## PC1 PC2 PC3 PC4 PC5 PC6
## 4.130067e+00 1.739939e+00 1.309278e+00 1.043551e+00 6.990377e-01 4.266669e-01
## PC7 PC8 PC9 PC10
## 3.412258e-01 1.906500e-01 1.195844e-01 2.071493e-31
# Plot PC1 vs PC2 scores on a common (square) axis range taken from PC1.
xlim <- range(pdata_pca$x[,1])
plot(pdata_pca$x,xlim=xlim,ylim=xlim)

library(vctrs)
##
## Attaching package: 'vctrs'
##
## The following object is masked from 'package:tibble':
##
## data_frame
##
## The following object is masked from 'package:dplyr':
##
## data_frame
library(purrr)
#Correlation circle
# Variable coordinates on each dimension (for a scaled PCA these are the
# correlations between the variables and the components).
head(var$coord, 4)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Red Meat -0.6464140 -0.2349159 -0.43644348 -0.04062284 0.44428523
## White Meat -0.6382482 -0.1554370 0.41673420 0.55010936 -0.08160309
## Egg -0.8540114 -0.1086430 0.02342911 0.15897634 0.22518048
## Milk -0.7865443 -0.3080838 -0.22881770 -0.32726265 -0.13251086
## Dim.6 Dim.7 Dim.8 Dim.9 Dim.10
## Red Meat -0.25723686 0.25083684 -0.06952433 -0.05930806 1.307462e-16
## White Meat 0.20211044 0.05406079 -0.12747846 -0.15971825 1.437053e-16
## Egg -0.03877233 -0.37382749 -0.11583077 0.16632959 4.283571e-17
## Milk 0.20116953 -0.10167588 0.23773563 -0.04571246 2.726634e-16
# Scree plot: percentage of variance explained per component, with labels.
fviz_eig(pdata_pca, addlabels = TRUE)

# Correlation circle, variables coloured by cos2 (quality of representation).
fviz_pca_var(pdata_pca,col.var = "cos2",
gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"),
repel = TRUE)

#We can use this graph to interpret the contribution of each variable to the PCs obtained from PCA. We can also see that White Meat, Milk, Egg and Red Meat are highly correlated with each other, just as Starchy Foods and Fish are, and as Cereals and Pulses are.
# Base-R biplot: scores and loadings on the first two PCs together.
biplot(pdata_pca)

#Quality of representation
# cos2 = squared coordinate: how well each variable is represented by each
# dimension (rows sum to 1 across all dimensions).
head(var$cos2, 4)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Red Meat 0.4178510 0.05518550 0.1904829088 0.001650215 0.197389362
## White Meat 0.4073607 0.02416065 0.1736673957 0.302620310 0.006659065
## Egg 0.7293355 0.01180330 0.0005489233 0.025273477 0.050706246
## Milk 0.6186519 0.09491560 0.0523575399 0.107100842 0.017559128
## Dim.6 Dim.7 Dim.8 Dim.9 Dim.10
## Red Meat 0.066170804 0.062919120 0.004833632 0.003517446 1.709456e-32
## White Meat 0.040848631 0.002922569 0.016250757 0.025509919 2.065121e-32
## Egg 0.001503294 0.139746989 0.013416767 0.027665532 1.834898e-33
## Milk 0.040469178 0.010337985 0.056518230 0.002089629 7.434534e-32
# Heatmap of cos2 values per variable and dimension (not a correlation
# matrix, hence is.corr = FALSE).
corrplot(var$cos2, is.corr=FALSE)

# Color by cos2 values: quality on the factor map
fviz_pca_var(pdata_pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)

#Contributions of variables to PCs
# Percent contribution of each variable to each principal component
# (columns sum to 100).
head(var$contrib, 4)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## Red Meat 10.117293 3.1716922 14.54869623 0.1581346 28.2373005 15.5087724
## White Meat 9.863295 1.3885919 13.26436162 28.9990836 0.9526046 9.5738918
## Egg 17.659167 0.6783745 0.04192564 2.4218721 7.2537218 0.3523343
## Milk 14.979220 5.4551123 3.99896215 10.2631125 2.5119001 9.4849577
## Dim.7 Dim.8 Dim.9 Dim.10
## Red Meat 18.4391446 2.535344 2.941392 4.3422302
## White Meat 0.8564913 8.523871 21.332145 5.2456640
## Egg 40.9544024 7.037382 23.134733 0.4660869
## Milk 3.0296610 29.645024 1.747409 18.8846404
# NOTE(review): corrplot is already attached at the top of the script; this
# duplicate library() call is redundant (though harmless, as library() is
# idempotent).
library("corrplot")
# Heatmap of the variable contributions per dimension.
corrplot(var$contrib, is.corr=FALSE)

# Correlation circle coloured by each variable's total contribution.
fviz_pca_var(pdata_pca, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

# Contributions of variables to PC1
fviz_contrib(pdata_pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(pdata_pca, choice = "var", axes = 2, top = 10)

# Contributions of variables to PC1 and PC2 combined
fviz_contrib(pdata_pca, choice = "var", axes = 1:2, top = 10)

#Graphs of individuals
# factoextra helper bundling coordinates, cos2 and contributions for the
# individuals (the 25 countries).
ind <- get_pca_ind(pdata_pca)
ind
## Principal Component Analysis Results for individuals
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the individuals"
## 2 "$cos2" "Cos2 for the individuals"
## 3 "$contrib" "contributions of the individuals"
# Coordinates of individuals (identical to the score matrix pdata_pca$x).
head(ind$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## 1 3.5978397 -0.6406110 1.1118946 -1.9111924 1.884437106 -0.3759335
## 2 -1.3862854 -0.7099190 1.1613381 0.9310749 -0.009121937 0.7581691
## 3 -1.6608482 0.1078173 -0.4231894 0.2468077 0.188016546 -0.9100155
## 4 2.9881523 -1.8436131 -0.0730564 0.3061617 -0.134812297 0.2900542
## 5 -0.3686147 -0.1014183 1.2155042 0.7220209 -0.062918010 -0.3709175
## 6 -2.4923551 0.1847475 -0.2075253 -0.9390683 -0.822177041 0.6520495
## Dim.7 Dim.8 Dim.9 Dim.10
## 1 0.6467777066 0.30820957 -0.34461060 3.383713e-16
## 2 0.0005093868 -0.01293303 0.12417664 -7.509950e-16
## 3 0.1534640851 -0.33404129 0.02332376 -6.583450e-16
## 4 0.5999541449 -0.76264035 0.67423555 -1.353251e-15
## 5 0.7878924305 -0.03968957 0.24192702 -6.505439e-16
## 6 -0.0364433564 -0.98412767 -0.16825415 -6.316059e-16
# Quality of representation of individuals: cos2, the share of each row's
# squared distance to the origin captured by each dimension.
head(ind$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## 1 0.57356783 0.018184023 0.0547808040 0.161848997 0.1573491742 0.006262145
## 2 0.36732320 0.096329817 0.2577866362 0.165696126 0.0000159044 0.109869006
## 3 0.68802143 0.002899472 0.0446695628 0.015193553 0.0088172714 0.206556862
## 4 0.64120168 0.244078067 0.0003832708 0.006731174 0.0013051133 0.006041539
## 5 0.04579072 0.003466286 0.4979033388 0.175683761 0.0013340805 0.046364625
## 6 0.67008846 0.003681878 0.0046457326 0.095127628 0.0729193825 0.045864155
## Dim.7 Dim.8 Dim.9 Dim.10
## 1 1.853580e-02 4.209140e-03 0.0052620919 5.073271e-33
## 2 4.959507e-08 3.197009e-05 0.0029472900 1.077996e-31
## 3 5.874293e-03 2.783187e-02 0.0001356873 1.081058e-31
## 4 2.584790e-02 4.176656e-02 0.0326446860 1.315061e-31
## 5 2.092021e-01 5.308660e-04 0.0197242663 1.426216e-31
## 6 1.432679e-04 1.044757e-01 0.0030538228 4.303330e-32
# Contributions of individuals: percent of each component's variance
# contributed by each row.
head(ind$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## 1 12.5367939 0.94344126 3.77707215 14.0008703 2.031995e+01 1.3249300
## 2 1.8612648 1.15862723 4.12045687 3.3228862 4.761388e-04 5.3889372
## 3 2.6715467 0.02672409 0.54713902 0.2334874 2.022794e-01 7.7636967
## 4 8.6478536 7.81386010 0.01630589 0.3592922 1.039964e-01 0.7887318
## 5 0.1315977 0.02364603 4.51378629 1.9982311 2.265215e-02 1.2898098
## 6 6.0162063 0.07846630 0.13157397 3.3801858 3.868032e+00 3.9859524
## Dim.7 Dim.8 Dim.9 Dim.10
## 1 4.903749e+00 1.993037543 3.97230614 1.163327
## 2 3.041680e-06 0.003509329 0.51578089 5.730454
## 3 2.760779e-01 2.341119409 0.01819628 4.403744
## 4 4.219434e+00 12.202893124 15.20578166 18.606797
## 5 7.276993e+00 0.033050347 1.95773638 4.299997
## 6 1.556879e-02 20.320113728 0.94692809 4.053287
# Individuals map coloured by cos2: how well each country is represented
# in the first two dimensions.
fviz_pca_ind(pdata_pca, col.ind = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping (slow if many points)
)

#CLUSTERING
#Distance measure
library(NbClust)
library(cluster)
library(readr)
library(factoextra)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Pairwise Euclidean distances between the 25 countries, computed on the
# standardised (stand = TRUE) measurement columns; Country (col 1) and
# Total (col 11) are dropped.
protein_dist <- get_dist(pdatas[c(-1,-11)], stand = TRUE, method = "euclidean")
protein_dist
## 1 2 3 4 5 6 7
## 2 5.8532900
## 3 5.7335008 2.5465552
## 4 2.6395440 4.8216564 5.2736332
## 5 4.9602301 1.7799956 2.1747325 3.9907771
## 6 6.5671718 2.9243339 2.4533110 6.1226012 3.3677881
## 7 6.4134362 2.4670392 2.3166803 5.5758731 2.0434782 3.0724538
## 8 5.8406039 3.8406799 3.4009989 5.9283025 3.9544563 2.6085261 4.3769688
## 9 6.1742286 3.6515531 2.2906856 5.6327312 3.3007058 3.7803717 3.9557671
## 10 4.0329196 5.3381984 4.8807791 3.8018379 4.9684184 5.7608608 5.8288945
## 11 4.3239024 3.1466248 4.1046496 3.1984826 2.7068851 4.9806320 3.5696540
## 12 6.4885734 2.6325738 1.8001254 6.1127477 3.1197009 2.8111758 3.2098204
## 13 3.7498024 3.7657225 3.8543173 2.8509554 3.2837498 4.9355223 4.4583913
## 14 5.9626954 0.9435472 2.3795661 5.2254644 2.3051360 2.5648993 2.7250661
## 15 5.4457085 3.6297069 2.7290545 5.3650109 3.3873780 2.0803710 3.3671156
## 16 5.7165517 2.6614908 2.9292777 4.5788319 2.1780483 4.0106800 2.6157163
## 17 6.2976365 6.4297383 5.5858852 6.0666226 5.5122545 6.0399845 5.3182926
## 18 2.5768976 4.3374705 4.6707088 1.7047167 3.4501722 5.4683617 4.7598552
## 19 5.0680351 4.9324883 3.9949339 4.8548832 4.1112316 5.1291342 4.0217635
## 20 5.8775665 2.7862695 2.5244791 5.6151507 3.2199074 1.2081340 3.2723879
## 21 4.9529805 2.1360194 2.4473759 4.4622628 2.3992005 3.2934407 3.7035298
## 22 5.8365164 3.6063734 1.8556264 5.8029350 3.5937897 3.2081924 3.8935655
## 23 3.9609658 3.7540429 3.1261489 3.6240876 2.5115136 4.1334745 3.5282693
## 24 5.9637845 1.3409333 1.7791191 5.3816377 2.0187736 2.5454973 2.1102420
## 25 2.7755336 5.5218480 5.8338373 2.2491049 4.5844276 6.5295053 5.8001602
## 8 9 10 11 12 13 14
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 4.6150744
## 10 5.7097979 4.7789613
## 11 5.3333012 5.1053974 4.2850275
## 12 3.2738587 3.3914878 5.6925883 4.6863290
## 13 5.0082418 3.8955518 2.3596802 3.1908949 4.7625440
## 14 3.5970486 3.3934201 5.1710323 3.4972572 2.3143576 3.8779632
## 15 2.2863117 3.8544030 4.6533776 4.6658740 3.6094330 3.9903428 3.3754514
## 16 4.3624011 3.4793814 4.4735805 3.0398080 3.7489430 3.0752251 2.9280510
## 17 6.6588227 5.5547807 4.8491900 5.6449530 7.1258298 4.7856955 6.3625595
## 18 5.1049726 5.4851406 3.6525991 2.2424874 5.4337154 2.9208731 4.6556697
## 19 5.4719510 4.5530910 3.2684234 3.9217489 5.2207565 3.1312167 4.8673242
## 20 2.0789731 3.9242956 5.3090340 4.7118733 2.7550599 4.3452517 2.4692299
## 21 3.4562862 2.5274841 4.3266802 3.8073662 2.7913200 2.9446793 1.8966361
## 22 3.6483965 3.0886557 5.0054744 5.0677943 1.8327677 4.3768934 3.2690262
## 23 3.5333342 4.1946705 4.2792677 3.2020283 4.0520695 3.4402756 3.9720243
## 24 3.7124018 3.0931434 5.2666612 3.5425016 2.0307766 3.9512746 0.9563501
## 25 5.9951737 6.4962265 4.1573776 3.1437121 6.6414965 3.8357345 5.8241028
## 15 16 17 18 19 20 21
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16 3.5553930
## 17 4.6204437 4.7919170
## 18 4.5736176 3.8870793 5.5049065
## 19 3.7708759 3.3382969 2.8830924 4.0166916
## 20 1.7036872 3.9105822 6.0249236 4.9829920 4.8591410
## 21 3.3356945 2.9812230 6.0908143 4.2030576 4.6396518 2.8701823
## 22 3.4836698 4.3952070 6.7248921 5.3323892 4.8209640 2.9525189 3.0135860
## 23 3.1361796 2.8552498 4.9740917 2.5412637 3.5301244 3.8416304 3.5101872
## 24 3.2425576 2.8341173 6.0487449 4.7272855 4.4960662 2.5244346 2.1234063
## 25 5.4611055 4.7532526 5.6772596 1.3353611 4.5772131 6.0815743 5.3301075
## 22 23 24
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23 4.1110165
## 24 2.8810687 3.7754617
## 25 6.5416937 3.3087134 5.9103892
#The result is a symmetric matrix where each element reflects the Euclidean distance between two rows in pdatas.
# Single (nearest-neighbour) linkage: at each step the two clusters with
# the smallest distance between any pair of their members are merged.
protein_sn <- hclust(protein_dist, method = "single")
plot(protein_sn,
     hang = -1,
     xlab = "Object",
     ylab = "Distance",
     main = "Dendrogram. Nearest neighbor linkage")

#This particular dendrogram uses the "single" linkage approach, which joins two clusters based on the shortest distance between their members.
#The smallest distance is between objects 2 and 14, so they form the first cluster; object 24 then joins that cluster.
#The vertical height of each branch in the dendrogram indicates the distance between the combined groups. Items that are clustered at a lower height are more similar to one another than those that are gathered at a higher height.
# Farthest-neighbour (complete) linkage — hclust()'s default method,
# written out explicitly here so the linkage choice is visible.
protein_fn <- hclust(protein_dist, method = "complete")
plot(protein_fn,
     hang = -1,
     xlab = "Object",
     ylab = "Distance",
     main = "Dendrogram. Farthest neighbor linkage")

#This particular dendrogram uses distance matrix using the farthest neighbor linkage method
# Group-average (UPGMA) linkage: cluster distance = mean of all pairwise
# distances between members.
# Bug fix: the original called hclust() with its default method
# ("complete"), so this plot silently duplicated the farthest-neighbour
# dendrogram instead of showing average linkage.
protein_av <- hclust(protein_dist, method = "average")
plot(protein_av, hang = -1, xlab = "Object", ylab = "Distance",
     main = "Dendrogram. Group average linkage")

#It is evident that all three approaches produce the same cluster formation sequence and virtually identical clusters.If all three hierarchical clustering methods (single, complete, and average linkage) generate the same dendrogram, the generated clusters are likely significant and representative of the data's true underlying structure
# Re-draw both trees as dendrogram objects on a common 0-10 distance axis
# so their merge heights can be compared directly.
dend_single <- as.dendrogram(protein_sn)
plot(dend_single, ylab = "Distance between each datapoint", ylim = c(0, 10))

dend_complete <- as.dendrogram(protein_fn)
plot(dend_complete, ylab = "Distance between each datapoint", ylim = c(0, 10))
#ylim = c(0, 10) fixes the vertical axis range at 0-10 so both dendrograms are drawn on a common distance scale; all merge heights fall within this range.
#Data Scaling
# Standardize each column to mean 0, sd 1 so that distance-based methods
# (k-means, hclust) are not dominated by large-valued columns.
# Columns 1 and 11 are dropped first — presumably a row identifier and the
# "Total" column (the later principal() call on pdatas[-1] does include
# "Total"); TODO confirm against the definition of pdatas.
matstd_protein <- scale(pdatas[c(-1,-11)])
matstd_protein
## Red Meat White Meat Egg Milk Fish Cereals
## [1,] 0.05876425 -1.84988830 -1.86538958 -1.16658295 -1.23330478 0.8791769
## [2,] -0.23505701 1.62533538 0.82507616 0.38322532 -0.65699414 -0.3923599
## [3,] 1.23404931 0.28871089 0.82507616 0.10144200 0.20747183 -0.4831840
## [4,] -0.52887828 -0.51326380 -0.96856767 -1.30747461 -0.94514946 2.2415378
## [5,] 0.05876425 0.82336069 -0.07174575 -0.60301630 -0.65699414 0.1525844
## [6,] 0.35258552 0.82336069 0.82507616 1.08768362 1.64824845 -0.9373043
## [7,] -0.52887828 1.09068558 0.82507616 -0.88479963 0.20747183 -0.6648321
## [8,] 0.05876425 -0.78058870 -0.07174575 2.35570856 0.49562716 -0.5740081
## [9,] 2.40933437 0.55603579 -0.07174575 0.38322532 0.49562716 -0.3923599
## [10,] 0.05876425 -1.31523850 -0.07174575 0.10144200 0.49562716 0.8791769
## [11,] -1.41034207 1.09068558 -0.07174575 -1.02569129 -1.23330478 0.6975288
## [12,] 1.23404931 0.55603579 1.72189807 1.22857528 -0.65699414 -0.7556562
## [13,] -0.23505701 -0.78058870 -0.07174575 -0.46212464 -0.36883881 0.4250566
## [14,] 0.05876425 1.62533538 0.82507616 0.80590030 -0.36883881 -0.9373043
## [15,] -0.23505701 -0.78058870 -0.07174575 0.80590030 1.64824845 -0.8464803
## [16,] -0.82269954 0.55603579 -0.07174575 0.24233366 -0.36883881 0.3342325
## [17,] -1.11652080 -1.04791360 -1.86538958 -1.73014959 2.80086974 -0.4831840
## [18,] -1.11652080 -0.51326380 -0.96856767 -0.88479963 -0.94514946 1.6057694
## [19,] -0.82269954 -1.31523850 -0.07174575 -1.16658295 0.78378248 -0.3015359
## [20,] 0.05876425 0.02138599 0.82507616 1.08768362 1.07193780 -1.1189524
## [21,] 0.94022805 0.55603579 -0.07174575 0.94679196 -0.65699414 -0.5740081
## [22,] 2.11551310 -0.51326380 1.72189807 0.52411698 -0.08068349 -0.7556562
## [23,] -0.23505701 -0.78058870 -0.96856767 -0.03944966 -0.36883881 1.0608250
## [24,] 0.35258552 1.35801048 0.82507616 0.24233366 -0.36883881 -1.2097765
## [25,] -1.70416333 -0.78058870 -1.86538958 -1.02569129 -0.94514946 2.1507138
## Starchy Foods Pulses Nuts and Oilseeds Fruits and Vegetables
## [1,] -2.0298502 1.44620630 -1.1489125
## [2,] -0.2174840 -1.03017435 -0.1044466
## [3,] 0.9907602 -0.53489822 -0.1044466
## [4,] -2.0298502 0.45565404 -0.1044466
## [5,] 0.3866381 -1.03017435 -0.1044466
## [6,] 0.3866381 -1.03017435 -1.1489125
## [7,] 1.5948823 -1.03017435 -0.1044466
## [8,] 0.3866381 -1.03017435 -1.6711455
## [9,] 0.3866381 -0.53489822 1.4622523
## [10,] -1.4257281 2.43675857 1.4622523
## [11,] -0.2174840 0.95093017 -0.1044466
## [12,] 0.9907602 -0.53489822 -0.6266796
## [13,] -1.4257281 0.45565404 1.4622523
## [14,] -0.2174840 -0.53489822 -0.1044466
## [15,] 0.3866381 -0.53489822 -0.6266796
## [16,] 0.9907602 -0.53489822 1.4622523
## [17,] 0.9907602 0.95093017 1.9844853
## [18,] -0.8216060 0.95093017 -0.6266796
## [19,] 0.9907602 1.44620630 1.4622523
## [20,] -0.2174840 -1.03017435 -1.1489125
## [21,] -0.8216060 -0.53489822 0.4177864
## [22,] 0.3866381 -0.03962209 -0.6266796
## [23,] 0.9907602 -0.03962209 -0.6266796
## [24,] 0.3866381 -0.53489822 -0.1044466
## [25,] -0.8216060 1.44620630 -0.6266796
## attr(,"scaled:center")
## Red Meat White Meat Egg
## 9.80 7.92 3.08
## Milk Fish Cereals
## 17.28 4.28 32.32
## Starchy Foods Pulses Nuts and Oilseeds Fruits and Vegetables
## 4.36 3.08 4.20
## attr(,"scaled:scale")
## Red Meat White Meat Egg
## 3.403430 3.740766 1.115049
## Milk Fish Cereals
## 7.097652 3.470351 11.010298
## Starchy Foods Pulses Nuts and Oilseeds Fruits and Vegetables
## 1.655295 2.019076 1.914854
#Kmeans
# Partition the standardized data into 3 clusters; nstart = 25 random
# initializations guard against settling in a poor local optimum.
km.res <- kmeans(matstd_protein, centers = 3, nstart = 25)
print(km.res)
## K-means clustering with 3 clusters of sizes 15, 8, 2
##
## Cluster means:
## Red Meat White Meat Egg Milk Fish Cereals
## 1 0.4701140 0.5203925 0.5859237 0.5804736 0.1306304 -0.6103377
## 2 -0.6390612 -0.6803419 -0.8564649 -0.7262965 -0.6930136 1.2424732
## 3 -0.9696102 -1.1815761 -0.9685677 -1.4483663 1.7923261 -0.3923599
## Starchy Foods Pulses Nuts and Oilseeds Fruits and Vegetables
## 1 0.3866381 -0.6999903 -0.20889319
## 2 -0.9726366 1.0128397 -0.03916747
## 3 0.9907602 1.1985682 1.72336879
##
## Clustering vector:
## [1] 2 1 1 2 1 1 1 1 1 2 2 1 2 1 1 1 3 2 3 1 1 1 2 1 2
##
## Within cluster sum of squares by cluster:
## [1] 63.079540 37.801856 4.156111
## (between_SS / total_SS = 51.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#Here, the k-means method clustered the data into three clusters, with sums of squares within each cluster ranging from 4.16 to 63.08.
# Determining the optimal numbers of Clusters
# NOTE(review): this call runs BEFORE the local fviz_nbclust() defined just
# below, so it presumably resolves to factoextra::fviz_nbclust — confirm
# that factoextra is attached earlier in the document, since it is not in
# the library() calls shown at the top.
fviz_nbclust(matstd_protein, kmeans, method = "gap_stat")

# Local copy of factoextra::fviz_nbclust().
# Visualizes the optimal number of clusters for a matrix/data.frame using
# the average silhouette width, the total within-cluster sum of squares,
# or the gap statistic — or summarizes an NbClust() result object.
#
# x          - numeric matrix/data.frame, or the list returned by NbClust().
# FUNcluster - clustering function (kmeans, pam, hcut, clara, ...); required
#              unless x is an NbClust() result.
# method     - criterion: "silhouette", "wss", or "gap_stat".
# diss       - optional precomputed dissimilarity; defaults to euclidean dist.
# k.max      - largest number of clusters to evaluate (must be >= 2).
# nboot      - bootstrap replicates for the gap statistic.
# Returns a ggplot object (or prints the NbClust summary).
fviz_nbclust <- function(x, FUNcluster = NULL,
                         method = c("silhouette", "wss", "gap_stat"),
                         diss = NULL, k.max = 10, nboot = 100,
                         verbose = interactive(),
                         barfill = "steelblue", barcolor = "steelblue",
                         linecolor = "steelblue", print.summary = TRUE, ...) {
  set.seed(123)  # reproducible k-means restarts / bootstrap samples
  if (k.max < 2)
    stop("k.max must be >= 2")  # message typo fixed (was "must bet > = 2")
  method <- match.arg(method)
  if (!inherits(x, c("data.frame", "matrix")) && !("Best.nc" %in% names(x)))
    stop("x should be an object of class matrix/data.frame or ",
         "an object created by the function NbClust() [NbClust package].")
  if (inherits(x, "list") && "Best.nc" %in% names(x)) {
    # x is an NbClust() result: print/plot the per-index votes instead of
    # re-running any clustering.
    best_nc <- x$Best.nc
    if (inherits(best_nc, "numeric"))
      print(best_nc)
    else if (inherits(best_nc, "matrix"))
      .viz_NbClust(x, print.summary, barfill, barcolor)
  }
  else if (is.null(FUNcluster))
    stop("The argument FUNcluster is required. ",
         "Possible values are kmeans, pam, hcut, clara, ...")
  else if (!is.function(FUNcluster)) {
    stop("The argument FUNcluster should be a function. ",
         "Check if you're not overriding the specified function name somewhere.")
  }
  else if (method %in% c("silhouette", "wss")) {
    if (is.data.frame(x))
      x <- as.matrix(x)
    if (is.null(diss))
      diss <- stats::dist(x)
    v <- rep(0, k.max)
    if (method == "silhouette") {
      # Silhouette is undefined for k = 1, so start at 2 (v[1] stays 0).
      for (i in 2:k.max) {
        clust <- FUNcluster(x, i, ...)
        v[i] <- .get_ave_sil_width(diss, clust$cluster)
      }
    }
    else if (method == "wss") {
      for (i in seq_len(k.max)) {
        clust <- FUNcluster(x, i, ...)
        v[i] <- .get_withinSS(diss, clust$cluster)
      }
    }
    df <- data.frame(clusters = as.factor(seq_len(k.max)), y = v,
                     stringsAsFactors = TRUE)
    ylab <- "Total Within Sum of Square"
    if (method == "silhouette")
      ylab <- "Average silhouette width"
    p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1,
                        color = linecolor, ylab = ylab,
                        xlab = "Number of clusters k",
                        main = "Optimal number of clusters")
    if (method == "silhouette")
      p <- p + geom_vline(xintercept = which.max(v), linetype = 2,
                          color = linecolor)
    return(p)
  }
  else if (method == "gap_stat") {
    extra_args <- list(...)
    gap_stat <- cluster::clusGap(x, FUNcluster, K.max = k.max,
                                 B = nboot, verbose = verbose, ...)
    # Allow callers to override the k-selection rule used by fviz_gap_stat().
    if (!is.null(extra_args$maxSE))
      maxSE <- extra_args$maxSE
    else maxSE <- list(method = "firstSEmax", SE.factor = 1)
    p <- fviz_gap_stat(gap_stat, linecolor = linecolor, maxSE = maxSE)
    return(p)
  }
}
# Helper used by fviz_nbclust() for NbClust() results.
# Prints the per-index votes (optionally) and bar-plots how many validity
# indices propose each number of clusters.
#
# x             - list with a Best.nc element, as returned by NbClust().
# print.summary - print a textual tally before plotting?
# Returns the printed vector (numeric Best.nc) or a ggplot bar chart.
.viz_NbClust <- function(x, print.summary = TRUE, barfill = "steelblue",
                         barcolor = "steelblue") {
  best_nc <- x$Best.nc
  if (inherits(best_nc, "numeric")) {
    print(best_nc)
  } else if (inherits(best_nc, "matrix")) {
    best_nc <- as.data.frame(t(best_nc), stringsAsFactors = TRUE)
    best_nc$Number_clusters <- as.factor(best_nc$Number_clusters)
    # Bug fix: the vote tally `ss` is needed for the plot data frame below,
    # but the original only computed it inside the print.summary branch,
    # so print.summary = FALSE raised "object 'ss' not found".
    ss <- summary(best_nc$Number_clusters)
    if (print.summary) {
      cat("Among all indices: \n===================\n")
      for (i in seq_along(ss)) {
        cat("*", ss[i], "proposed ", names(ss)[i],
            "as the best number of clusters\n")
      }
      cat("\nConclusion\n=========================\n")
      cat("* According to the majority rule, the best number of clusters is ",
          names(which.max(ss)), ".\n\n")
    }
    df <- data.frame(Number_clusters = names(ss), freq = ss,
                     stringsAsFactors = TRUE)
    p <- ggpubr::ggbarplot(df, x = "Number_clusters", y = "freq",
                           fill = barfill, color = barcolor) +
      labs(x = "Number of clusters k", y = "Frequency among all indices",
           title = paste0("Optimal number of clusters - k = ",
                          names(which.max(ss))))
    return(p)
  }
}
res.nbclust <- pdatas[c(-1,-11)] %>% scale() %>% NbClust(distance = "euclidean", min.nc = 2, max.nc = 10, method = "complete", index ="all")
## Warning in pf(beale, pp, df2): NaNs produced
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 3 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 5 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
fviz_nbclust(res.nbclust, ggtheme = theme_minimal())
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 6 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 3 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 5 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .

# Partition Around Medoids with k = 2 on the standardized data
pam.res <- pam(matstd_protein, k = 2)
# Visualize the resulting clusters
fviz_cluster(pam.res)

# Ward (ward.D2) hierarchical clustering on the standardized data.
# matstd_protein was already produced by scale() earlier in the script, so
# the original's extra scale() step here was redundant and has been dropped.
res.hc <- matstd_protein %>%
  dist(method = "euclidean") %>%
  hclust(method = "ward.D2")
fviz_dend(res.hc, k = 2,                     # cut into TWO groups (comment previously said four)
          cex = 0.5,                         # label size
          k_colors = c("#2E9FDF", "#00AFBB"),# one color per cluster — avoids the
                                             # "color vector longer than k" warning
          color_labels_by_k = TRUE,          # color labels by groups
          rect = TRUE                        # add rectangle around groups
)
## Warning in get_col(col, k): Length of color vector was longer than the number
## of clusters - first k elements are used
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.

#Factor Analysis
# Principal-components "factor" analysis (psych::principal) on every column
# except the first, extracting 4 components with varimax rotation.
# NOTE(review): pdatas[-1] keeps the "Total" column here (see the loadings
# output below), unlike the clustering steps which dropped column 11.
fit.pc <- principal(pdatas[-1], nfactors=4, rotate="varimax")
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## In factor.stats, I could not find the RMSEA upper bound . Sorry about that
## Warning in principal(pdatas[-1], nfactors = 4, rotate = "varimax"): The matrix
## is not positive semi-definite, scores found from Structure loadings
fit.pc
## Principal Components Analysis
## Call: principal(r = pdatas[-1], nfactors = 4, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 RC3 RC4 h2 u2 com
## Red Meat 0.26 0.21 0.74 -0.11 0.67 0.335 1.5
## White Meat 0.94 -0.16 0.05 0.03 0.91 0.092 1.1
## Egg 0.72 0.21 0.43 -0.15 0.77 0.233 1.9
## Milk 0.32 0.26 0.68 -0.49 0.87 0.127 2.6
## Fish -0.15 0.92 0.03 0.11 0.88 0.121 1.1
## Cereals -0.58 -0.71 -0.17 0.18 0.90 0.096 2.2
## Starchy Foods 0.53 0.60 -0.05 0.13 0.66 0.336 2.1
## Pulses Nuts and Oilseeds -0.75 -0.24 -0.21 0.37 0.80 0.196 1.9
## Fruits and Vegetables -0.07 0.15 0.01 0.95 0.93 0.075 1.1
## Total -0.03 -0.25 0.86 0.18 0.83 0.166 1.3
##
## RC1 RC2 RC3 RC4
## SS loadings 2.77 2.04 2.00 1.41
## Proportion Var 0.28 0.20 0.20 0.14
## Cumulative Var 0.28 0.48 0.68 0.82
## Proportion Explained 0.34 0.25 0.24 0.17
## Cumulative Proportion 0.34 0.59 0.83 1.00
##
## Mean item complexity = 1.7
## Test of the hypothesis that 4 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.07
## with the empirical chi square 10.96 with prob < 0.45
##
## Fit based upon off diagonal values = 0.97
## RC1, RC2, RC3, and RC4 are the rotated-component loadings for each of the ten variables (rows) in the dataset.
#
# h2 refers to the communality of each variable(explained variance)
#
# u2 refers to the uniqueness of each variable(unexplained variance)
#
#com represents the entire proportion of variance in each variable(explained+unexplained variance).
#In this case, the first factor explains 34% of the variance, the second factor explains 25%, the third factor explains 24%, and the fourth factor explains 17%. Together, the four factors explain 100% of the common variance extracted.
#mean item complexity is 1.7, which suggests that the factor structure is moderately complex for each variable which further implies that the relationships between the variables and the underlying factors are not entirely straightforward or simple.
# Eigenvalues of the correlation matrix (variance carried by each component)
round(fit.pc$values, 3)
## [1] 4.130 1.740 1.309 1.044 0.699 0.427 0.341 0.191 0.120 0.000
# Rotated component loadings (print() blanks entries with |loading| < 0.1)
fit.pc$loadings
##
## Loadings:
## RC1 RC2 RC3 RC4
## Red Meat 0.259 0.213 0.735 -0.111
## White Meat 0.937 -0.161
## Egg 0.720 0.208 0.426 -0.154
## Milk 0.316 0.261 0.684 -0.488
## Fish -0.148 0.918 0.113
## Cereals -0.579 -0.714 -0.167 0.176
## Starchy Foods 0.531 0.602 0.131
## Pulses Nuts and Oilseeds -0.752 -0.240 -0.205 0.373
## Fruits and Vegetables 0.153 0.947
## Total -0.246 0.860 0.183
##
## RC1 RC2 RC3 RC4
## SS loadings 2.773 2.040 2.005 1.405
## Proportion Var 0.277 0.204 0.200 0.141
## Cumulative Var 0.277 0.481 0.682 0.822
#RC1 is associated with White Meat and Egg (and negatively with Pulses Nuts and Oilseeds), while RC2 is associated with Fish and Starchy Foods (and negatively with Cereals). RC3 is associated with Red Meat, Milk, and Total, and RC4 is associated with Fruits and Vegetables.
# Loadings with more digits
# Prints Red Meat's (row 1) loadings at full precision. Note the index
# order c(1,3,2,4): the values appear as RC1, RC3, RC2, RC4 — not RC1..RC4.
for (i in c(1,3,2,4)) { print(fit.pc$loadings[[1,i]])}
## [1] 0.2589173
## [1] 0.7352675
## [1] 0.2126479
## [1] -0.1108784
#Loadings for "Red Meat": 0.259 on RC1, 0.213 on RC2, 0.735 on RC3, and -0.111 on RC4 (the loop above prints them in RC1, RC3, RC2, RC4 order).
# Communalities
# Proportion of each variable's variance reproduced by the 4 components
fit.pc$communality
## Red Meat White Meat Egg
## 0.6651696 0.9078091 0.7669612
## Milk Fish Cereals
## 0.8730259 0.8789782 0.9035506
## Starchy Foods Pulses Nuts and Oilseeds Fruits and Vegetables
## 0.6638440 0.8043733 0.9250830
## Total
## 0.8340405
# Communalities sorted high-to-low, with the matching variable names
comm_sorted <- sort(fit.pc$communality, decreasing = TRUE)
comm_sorted_names <- names(fit.pc$communality)[order(-fit.pc$communality)]
# Rotated factor scores
# Per-observation scores on the rotated components RC1..RC4
fit.pc$scores
## RC1 RC2 RC3 RC4
## [1,] -5.7605734 -3.36700491 -3.8633784 -0.39909495
## [2,] 3.1694665 -0.25158518 0.7561406 -0.89662602
## [3,] 2.3868189 1.56149347 1.8337942 -0.46124833
## [4,] -4.3208782 -4.55896299 -1.5315590 1.04713255
## [5,] 1.5578630 -0.41964941 -0.6288392 -0.24291402
## [6,] 3.1399401 2.71500531 2.3853571 -1.94188415
## [7,] 3.2301617 1.84901437 -1.5130562 -0.21860020
## [8,] 1.3146712 1.65752382 2.4667399 -3.00560097
## [9,] 1.8007806 1.36671635 3.8271586 1.14045289
## [10,] -4.5810717 -1.61817397 1.0655364 2.56985982
## [11,] -0.7455549 -2.64356726 -2.4704373 0.82593410
## [12,] 3.9476155 1.21073039 3.3976235 -1.69957990
## [13,] -2.3770340 -1.36214582 -0.9545821 1.57348514
## [14,] 3.2792689 0.45581557 1.2593824 -1.01402637
## [15,] 0.3278365 2.77552853 0.1615539 -1.17661290
## [16,] 0.9782808 -0.08782629 0.4161197 1.51507925
## [17,] -3.5783940 3.05623951 -4.2281851 3.53676835
## [18,] -3.6436378 -3.44529927 -2.2388510 0.53509931
## [19,] -2.4289034 1.46500473 -2.9477324 2.46464265
## [20,] 2.2286387 2.34569499 1.0237541 -2.35625096
## [21,] 1.3670756 -0.21504380 1.7800778 -0.57852957
## [22,] 2.1943713 1.57880111 2.9414916 -1.42721666
## [23,] -1.4884026 -0.92805896 -0.1658179 -0.01163555
## [24,] 3.4360982 1.19111164 0.3305165 -0.91145006
## [25,] -5.4344376 -4.33136193 -3.1028077 1.13281655
# Play with FA utilities
# Parallel analysis: compares observed eigenvalues against those of random
# data to recommend how many factors/components to retain
fa.parallel(pdatas[-1]) # See factor recommendation
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## In factor.scores, the correlation matrix is singular, the pseudo inverse is used
## I was unable to calculate the factor score weights, factor loadings used instead

## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
# Scatter plot of the variables in component-loading space
fa.plot(fit.pc)

# Path diagram showing which variables load most strongly on each component
fa.diagram(fit.pc)

# The length of the arrow represents the strength of the loading, and the color indicates the sign of the loading (positive or negative).
# The plot shows how the variables in the factor analysis model are related to each other and which variables are most strongly associated to each factor.
#White meat and egg have a strong positive loading on RC1 while Pulses nut and oilseeds is having a negative loading on RC1
#Fish and starchy foods have a strong positive loading on RC2 while Cereals is having a negative loading on RC2
#Total and Red Meat and Milk have a strong positive loading on RC3
#Fruits and Vegetables have a strong positive loading on RC4
vss(pdatas[-1])
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(r): The estimated weights for the factor scores are
## probably incorrect. Try a different factor score estimation method.
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(r): The estimated weights for the factor scores are
## probably incorrect. Try a different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## In factor.stats, I could not find the RMSEA upper bound . Sorry about that
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## In factor.stats, I could not find the RMSEA upper bound . Sorry about that
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## An ultra-Heywood case was detected. Examine the results carefully
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully

##
## Very Simple Structure
## Call: vss(x = pdatas[-1])
## VSS complexity 1 achieves a maximimum of 0.71 with 1 factors
## VSS complexity 2 achieves a maximimum of 0.87 with 3 factors
##
## The Velicer MAP achieves a minimum of 0.09 with 1 factors
## BIC achieves a minimum of 306.49 with 5 factors
## Sample Size adjusted BIC achieves a minimum of 321.99 with 5 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex
## 1 0.71 0.00 0.087 35 453 3.6e-74 6.94 0.71 0.69 341 449 1.0
## 2 0.60 0.82 0.106 26 419 2.0e-72 4.25 0.82 0.78 335 416 1.3
## 3 0.60 0.87 0.145 18 390 1.0e-71 2.60 0.89 0.91 332 388 1.6
## 4 0.53 0.78 0.173 11 356 1.1e-69 1.28 0.95 1.12 321 355 1.7
## 5 0.55 0.81 0.209 5 323 1.4e-67 0.70 0.97 1.59 306 322 1.8
## 6 0.50 0.77 0.314 0 294 NA 0.61 0.97 NA NA NA 2.1
## 7 0.45 0.67 0.477 -4 277 NA 0.40 0.98 NA NA NA 2.2
## 8 0.40 0.62 1.000 -7 250 NA 0.12 0.99 NA NA NA 2.3
## eChisq SRMR eCRMS eBIC
## 1 56.2527 0.1581 0.179 -56
## 2 27.4318 0.1104 0.145 -56
## 3 12.8827 0.0757 0.120 -45
## 4 4.6981 0.0457 0.092 -31
## 5 0.9579 0.0206 0.062 -15
## 6 0.3033 0.0116 NA NA
## 7 0.0199 0.0030 NA NA
## 8 0.0031 0.0012 NA NA
#The criteria disagree: VSS complexity 1 peaks at 1 factor and complexity 2 at 3 factors, the Velicer MAP minimum is at 1 factor, while BIC and sample-size-adjusted BIC favor 5 — so a choice in the 3-5 factor range is defensible but not clear-cut.